Requirements

NOTE: This analysis requires at least 10Gb of RAM to run.

Parameters

input_folder <- "raw_input"
output_folder <- "results"
output_format <- ".pdf"

Parsing taxonomic data

The code below downloads the compressed data to a temporary directory:

library(metacoder)
data_url <- "http://taraoceans.sb-roscoff.fr/EukDiv/data/Database_W5_OTU_occurences.tsv.zip"
temp_dir_path <- tempdir()
local_file_path <- file.path(temp_dir_path, basename(data_url))
download.file(url = data_url, destfile = local_file_path, quiet = TRUE)

Next we will uncompress the archive and identify the fasta file.

# Get contents of zip archive
unpacked_file_paths <- unzip(local_file_path, list = TRUE)
# Uncompress archive
unzip(local_file_path, exdir = temp_dir_path)
# Identify the Mothur RDP training set
unpacked_file_path <- file.path(temp_dir_path, 
                                unpacked_file_paths[grepl("tsv$", unpacked_file_paths)])
data <- parse_taxonomy_table(unpacked_file_path, taxon_col = c("class" = -9), class_sep = "\\|")

Getting sample data

data_url <- "http://taraoceans.sb-roscoff.fr/EukDiv/data/Database_W1_Sample_parameters.xls"
temp_dir_path <- tempdir()
local_file_path <- file.path(temp_dir_path, basename(data_url))
download.file(url = data_url, destfile = local_file_path, quiet = TRUE)
sample_data <- readxl::read_excel(local_file_path)
## DEFINEDNAME: 00 00 00 1d 0b 00 00 00 01 00 00 00 00 00 00 45 78 63 65 6c 5f 42 75 69 6c 74 49 6e 5f 5f 46 69 6c 74 65 72 44 61 74 61 62 61 73 65 3b 00 00 00 00 4e 01 00 00 1c 00 
## DEFINEDNAME: 00 00 00 1d 0b 00 00 00 01 00 00 00 00 00 00 45 78 63 65 6c 5f 42 75 69 6c 74 49 6e 5f 5f 46 69 6c 74 65 72 44 61 74 61 62 61 73 65 3b 00 00 00 00 4e 01 00 00 1c 00 
## DEFINEDNAME: 00 00 00 1d 0b 00 00 00 01 00 00 00 00 00 00 45 78 63 65 6c 5f 42 75 69 6c 74 49 6e 5f 5f 46 69 6c 74 65 72 44 61 74 61 62 61 73 65 3b 00 00 00 00 4e 01 00 00 1c 00 
## DEFINEDNAME: 00 00 00 1d 0b 00 00 00 01 00 00 00 00 00 00 45 78 63 65 6c 5f 42 75 69 6c 74 49 6e 5f 5f 46 69 6c 74 65 72 44 61 74 61 62 61 73 65 3b 00 00 00 00 4e 01 00 00 1c 00

Caluculate read abundance per taxon

sample_columns <- sample_data[["PANGAEA ACCESSION NUMBER"]]
data <- mutate_taxa(data,
                    read_counts = vapply(obs(data),
                                         function(x) sum(data$obs_data$totab[x]), numeric(1)))

Plot read and OTU abundance

seed = 9 #9, 10, 12 is good
set.seed(seed)
taxa_patterns_to_hide <- paste0("^", c("[X]+", "X\\+sp\\.", "NA", "root", "\\*", "sp\\.", "sp"), "$")
taxa_patterns_to_remove <- paste0("^", c("X\\+sp\\.", "NA", "root", "\\*", "sp\\.", "sp"), "$")
background_color <- "#ccfff7"
data %>%
  filter_taxa(! Reduce(`|`, lapply(taxa_patterns_to_remove, grepl, x = name))) %>%
  filter_taxa(read_counts >= 100) %>%
  filter_taxa(!(taxon_ids %in% subtaxa(data, name == "Eukaryota", simplify = TRUE, include_input = TRUE) & n_supertaxa <= 1)) %>%
  heat_tree(title = "Plankton diversity in the sunlit ocean",
            title_size = 0.03,
            node_color_axis_label = "Number of reads (Abundance)",
            node_size_axis_label = "Number of species (OTUs)",
            node_size = n_obs,
            node_size_range = c(0.0012, NA),
            node_color = read_counts,
            node_color_range = c("grey", "#80cdc1", "#018571", "#dfc27d", "#a6611a"),
            node_color_trans = "log10",
            node_label = ifelse(grepl(pattern = "^[a-zA-z\\-]{1,25}$", name) &
                                  ! Reduce(`|`, lapply(taxa_patterns_to_hide, grepl, x = name)) &
                                  read_counts > 10000,
                                name, NA),
            node_label_color = "#000000",
            node_label_color_trans = "area",
            node_label_size = (n_obs / (n_supertaxa + 1)) ^ 0.5,
            node_label_size_trans = "area",
            node_label_size_range = c(0.001, NA),
            node_label_max = 1000,
            tree_label = name,
            tree_label_color = "#00806c",
            tree_label_max = 100,
            initial_layout = "re", layout = "da",
            overlap_avoidance = .65,
            background_color = background_color,
            maxiter = 50, fineiter = 50,
            margin_size = c(0.001, 0.001),
            output_file = file.path(output_folder, paste0("sup_figure_1--tara_all_plankton_diversity",  output_format)))

Plot propotion of OTUs identified

data <- mutate_obs(data, pid = as.numeric(pid))
data <- mutate_taxa(data, mean_pid = vapply(obs(data),
                                            function(x) mean(data$obs_data$pid[x], na.rm = TRUE), numeric(1)))
# Percentage of OTUs with less than 90% idententiy
data <- mutate_taxa(data, percent_known = vapply(obs(data),
                                                 function(x) sum(data$obs_data$pid[x] >= 90, na.rm = TRUE) / length(x) * 100, numeric(1)))
seed = 1
set.seed(seed)
taxa_patterns_to_hide <- paste0("^", c("[X]+", "X\\+sp\\.", "NA", "root", "\\*", "sp\\.", "sp"), "$")
taxa_patterns_to_remove <- paste0("^", c("X\\+sp\\.", "NA", "root", "\\*", "sp\\.", "sp"), "$")
data %>%
  filter_taxa(! Reduce(`|`, lapply(taxa_patterns_to_remove, grepl, x = name))) %>%
  filter_taxa(read_counts >= 100) %>%
  filter_taxa(!(taxon_ids %in% subtaxa(data, name == "Eukaryota", simplify = TRUE, include_input = TRUE) & n_supertaxa <= 1)) %>%
  heat_tree(title = "Poportion of OTUs not well identified",
            title_size = 0.03,
            node_color_axis_label = "Percent of species (OTUs) identified",
            node_size_axis_label = "Number of species (OTUs)",
            node_size = n_obs,
            node_size_range = c(0.0012, NA),
            node_color = percent_known,
            node_color_range = c("red", "orange", "yellow", "green", "cyan"),
            node_color_trans = "linear",
            node_color_interval = c(0, 100),
            node_label = ifelse(grepl(pattern = "^[a-zA-z\\-]{1,25}$", name) &
                                  ! Reduce(`|`, lapply(taxa_patterns_to_hide, grepl, x = name)),
                                name, NA),
            node_label_size = (n_obs / (n_supertaxa + 1)) ^ 0.5,
            node_label_size_trans = "area",
            node_label_size_range = c(0.001, NA),
            node_label_max = 1000,
            tree_label = name,
            tree_label_max = 100,
            initial_layout = "re", layout = "da",
            overlap_avoidance = .65,
            maxiter = 50, fineiter = 50,
            margin_size = c(0.001, 0.001),
            output_file = file.path(output_folder, paste0("sup_figure_2--tara_proportion_identified",  output_format)))

Just Metazoa

set.seed(64)
taxa_patterns_to_hide <- paste0("^", c("[X]+", "X\\+sp\\.", "NA", "root", "\\*", "sp\\.", "sp"), "$")
taxa_patterns_to_remove <- paste0("^", c("[X]+", "X\\+sp\\.", "NA", "root", "\\*", "sp\\.", "sp"), "$")
data %>%
  filter_taxa(! Reduce(`|`, lapply(taxa_patterns_to_remove, grepl, x = name))) %>%
  filter_taxa(read_counts >= 200) %>%
  filter_taxa(name == "Metazoa", subtaxa = TRUE) %>%
  heat_tree(node_color_axis_label = "Percent of OTUs identified",
            edge_color_axis_label = "Percent of OTUs identified",
            edge_size = read_counts,
            edge_size_range = c(0.001, 0.01),
            edge_color = percent_known,
            edge_color_range = c("red", "orange", "yellow", "greenyellow", "green"),
            edge_color_trans = "linear",
            node_size_axis_label = "Number of OTUs",
            edge_size_axis_label = "Number of reads",
            node_size = n_obs,
            node_size_range = c(0.005, 0.03),
            node_color = percent_known,
            node_color_range = c("red", "orange", "yellow", "greenyellow", "green"),
            node_color_trans = "linear",
            node_color_interval = c(0, 100),
            node_label = ifelse(grepl(pattern = "^[a-zA-z\\-]{1,25}$", name) &
                                  ! Reduce(`|`, lapply(taxa_patterns_to_hide, grepl, x = name)),
                                name, NA),
            node_label_size = (n_obs / (n_supertaxa + 1)) ^ 0.5,
            node_label_size_range = c(0.008, 0.012),
            node_label_size_trans = "area",
            node_label_max = 1000,
            initial_layout = "fr", layout = "da",
            overlap_avoidance = .8,
            maxiter = 50, fineiter = 50,
            weight.node.edge.dist = 15,
            output_file = file.path(output_folder,
                                    paste0("figure_1--tara_metazoa--seed_",
                                           seed, output_format)))